# abundance of sRNA species
### Select only for 21,22 and 24
species_srna = function(path, sample_name, sex_id, tissue_id) {
table = data.table(read.table(path))
counts = as.data.frame(table[, .N, by='V2']) %>% mutate(sample = sample_name, sex = sex_id, tissue = tissue_id)
abundance = as.data.frame(table[, .N, by='V1']) %>% mutate(sample = sample_name, sex = sex_id, tissue = tissue_id) %>% filter(N > 2)
list(table, counts, abundance)
}
### FEMALE SAMPLES
f1l=species_srna("raw/sRNA_MGX/trimmed/F1L_final_trimming.fastq_count.tsv", "F1L", "Female", "Leaf")
f1b=species_srna("raw/sRNA_MGX/trimmed/F1B_final_trimming.fastq_count.tsv", "F1B", "Female", "Flower bud")
f2l=species_srna("raw/sRNA_MGX/trimmed/F2L_final_trimming.fastq_count.tsv", "F2L", "Female", "Leaf")
f2b=species_srna("raw/sRNA_MGX/trimmed/F2B_final_trimming.fastq_count.tsv", "F2B", "Female", "Flower bud")
f3l=species_srna("raw/sRNA_MGX/trimmed/F3L_final_trimming.fastq_count.tsv", "F3L", "Female", "Leaf")
f3b=species_srna("raw/sRNA_MGX/trimmed/F3B_final_trimming.fastq_count.tsv", "F3B", "Female", "Flower bud")
### MALE SAMPLES
m1l=species_srna("raw/sRNA_MGX/trimmed/M1L_final_trimming.fastq_count.tsv", "M1L", "Male", "Leaf")
m1b=species_srna("raw/sRNA_MGX/trimmed/M1B_final_trimming.fastq_count.tsv", "M1B", "Male", "Flower bud")
m2l=species_srna("raw/sRNA_MGX/trimmed/M2L_final_trimming.fastq_count.tsv", "M2L", "Male", "Leaf")
m2b=species_srna("raw/sRNA_MGX/trimmed/M2B_final_trimming.fastq_count.tsv", "M2B", "Male", "Flower bud")
m4l=species_srna("raw/sRNA_MGX/trimmed/M4L_final_trimming.fastq_count.tsv", "M4L", "Male", "Leaf")
m4b=species_srna("raw/sRNA_MGX/trimmed/M4B_final_trimming.fastq_count.tsv", "M4B", "Male", "Flower bud")
### Species abundance
spp = do.call(rbind, lapply(list(f1l[[3]],
f1b[[3]],
f2l[[3]],
f2b[[3]],
f3l[[3]],
f3b[[3]],
m1l[[3]],
m1b[[3]],
m2l[[3]],
m2b[[3]],
m4l[[3]],
m4b[[3]]), as.data.frame
) )
rm(f1b, f1l, f2b, f2l, f3b, f3l, m1b, m1l, m2b, m2l, m4b, m4l)
colnames(spp) = c("sRNA", "counts", "sample", "sex", "tissue")
spp = spp %>% mutate(log_counts = log2(counts))
#### Histogram
ggplot(spp) +
geom_density(aes(x=log_counts, color=sample), alpha=0.6) +
facet_grid(tissue ~ sex) +
scale_color_manual(values = met.brewer("Hiroshige", 12)) +
labs(y="Density", x="log2(Expression)")
ggsave("results/sp_expression.png", width = 6, height = 5, dpi = 600)
### With 320880141 total reads and mincov of 1 reads per million, the min read depth is 321
### For RPM > 1
no_uniq = spp[spp$counts>321, ]
ggplot(no_uniq) +
geom_density(aes(x=log2(counts), color=sample), alpha=0.6) +
facet_grid(tissue ~ sex) +
scale_color_manual(values = met.brewer("Hiroshige", 12)) +
labs(y="Density", x="log2(Expression)")
ggsave("results/reduced_sp_expression.png", width = 6, height = 5, dpi = 600)
lgt = do.call(rbind, lapply(list(f1l[[2]],
f1b[[2]],
f2l[[2]],
f2b[[2]],
f3l[[2]],
f3b[[2]],
m1l[[2]],
m1b[[2]],
m2l[[2]],
m2b[[2]],
m4l[[2]],
m4b[[2]]), as.data.frame
) )
ggplot(lgt) +
geom_bar(aes(x=as.factor(V2), y=as.numeric(N), fill=sample), stat="identity", position = "dodge") +
facet_grid(tissue ~ sex) +
scale_fill_manual(values = met.brewer("Hiroshige", 12)) +
labs(x="sRNA Length", y="Count")
ggsave("results/length.png", width = 6, height = 5, dpi = 600)
fracs = read.table("raw/ShortStack_results/alignment_details.tsv", header = T)
fracs = filter(fracs, mapping_type == "P", count > 0)
fracs = cbind(data.frame(
sample = rep(c(rep(1,6),rep(2,6), rep(3,6)), 2),
sex = c(rep("Female", 18), rep("Male", 18)),
organ = rep(c(rep("Flower bud", 3), rep("Leaf", 3)), 6)
) , fracs)
fracs = fracs %>% select(sample, sex, organ, read_length, count)
ggplot(fracs) +
geom_bar(aes(fill=read_length, y=count, x=sample), position="fill", stat="identity") +
facet_grid(organ ~ sex) +
scale_fill_manual(values = met.brewer("Hokusai3")) +
labs(y="Percentage of aligned reads", x="Individual", fill="sRNA length") +
theme(text = element_text(size=20))
ggsave("results/fractions.png", width = 6, height = 6, dpi = 600)
rm(fracs)
# set boundaries
chr_sizes = read.table("chromSizes.txt")
colnames(chr_sizes) = c("chr", "end")
chr_sizes = chr_sizes %>% mutate(start = "1") %>% relocate(start, .before = end)
# get density of PTGS clusters
ptgs_bed = read.table("raw/only_21-22_known_de_novo_f-y/Results.gff3")[, c(1, 4:5)]
colnames(ptgs_bed) = c("chr", "start", "end")
# get density of PTGS clusters
rddm_bed = read.table("raw/only_24_f-y/Results.gff3")[, c(1, 4:5)]
colnames(rddm_bed) = c("chr", "start", "end")
# load gene annotation
gene_bed = read.table("annotation/genes.bed")
#repeats_bed = read.table("annotation/repeats.bed", sep = "\t")
tes_bed = read.table("annotation/tes.bed", sep = "\t")
tesorter_bed = read.table("tesorter_genome/tes_genome.dom.bed")
# plot densities
{
png(file="results/srna_density.png", width=2000, height=2000)
circos.par("clock.wise"=TRUE, start.degree=86, gap.degree=c(rep(1,18), 8))
circos.genomicInitialize(chr_sizes, plotType = c("axis"), axis.labels.cex=2)
circos.track(ylim = c(0, 0.05), bg.border = NA, bg.col = c(viridis::inferno(11, begin=0.15, end=0.9, direction = -1), "gray", rep("black", 8)) , track.height = 0.05)
circos.genomicDensity(gene_bed, col = c("#ee4d80"), track.height = 0.1, count_by = "number")
circos.genomicDensity(tes_bed, col = c("#b52ac6"), track.height = 0.1, count_by = "number")
circos.genomicDensity(ptgs_bed, col = c("#5EBF75"), track.height = 0.1, count_by = "number")
circos.genomicDensity(rddm_bed, col = c("#6276F6"), track.height = 0.1, count_by = "number")
circos.clear()
dev.off()
}
# with TEsortert annotations (smaller file)
{
png(file="results/densities.png", width=2000, height=2000)
circos.par("clock.wise"=TRUE, start.degree=86, gap.degree=c(rep(1,18), 8))
circos.genomicInitialize(chr_sizes, plotType = c("axis"), axis.labels.cex=2)
circos.track(ylim = c(0, 0.05), bg.border = NA, bg.col = c(viridis::inferno(11, begin=0.15, end=0.9, direction = -1), "gray", rep("black", 8)) , track.height = 0.05)
circos.genomicDensity(gene_bed, col = c("#ee4d80"), track.height = 0.1, count_by = "number")
circos.genomicDensity(tesorter_bed, col = c("#b52ac6"), track.height = 0.1, count_by = "number")
circos.genomicDensity(ptgs_bed, col = c("#5EBF75"), track.height = 0.1, count_by = "number")
circos.genomicDensity(rddm_bed, col = c("#6276F6"), track.height = 0.1, count_by = "number")
circos.clear()
dev.off()
}
#### Circos plot of sRNA mapping per 1mb windows
# create function
surco = function(toy, lib_size_1, lib_size_2, lib_size_3) {
# set column names
colnames(toy) = c("chr", "start", "end", "depth_1", "depth_2", "depth_3", "chr_w", "start_w", "end_w", "wind")
# calculate mead depth per window, normalize by library size
toy[, mean_depth_1 := mean(depth_1)/lib_size_1, by=.(wind)]
toy[, mean_depth_2 := mean(depth_2)/lib_size_2, by=.(wind)]
toy[, mean_depth_3 := mean(depth_3)/lib_size_3, by=.(wind)]
# get one row per window
toy = unique(toy, by = "wind")[, .(chr_w, start_w, end_w, wind, mean_depth_1, mean_depth_2, mean_depth_3)]
# overall mean and multiply by 1m
toy[, mean_depth := (mean_depth_1 + mean_depth_2 + mean_depth_3)*(1000000/3), by=1:nrow(toy)]
toy[, log_depth := log(mean_depth), by=1:nrow(toy)]
# print bed file + mean depth per million
toy[, .(chr_w, start_w, end_w, mean_depth, log_depth)]
}
# load files and do computations (library sizes are in the jupyter notebook)
females_depth_ptgs = fread("raw/depth/females_flowers_ptgs.depth")
females_depth_ptgs = surco(females_depth_ptgs, 54935002, 29689313, 25245689)
females_depth_rddm = fread("raw/depth/females_flowers_rddm.depth")
females_depth_rddm = surco(females_depth_rddm, 54935002, 29689313, 25245689)
males_depth_ptgs = fread("raw/depth/males_flowers_ptgs.depth")
males_depth_ptgs = surco(males_depth_ptgs, 13224600, 23943681, 23582102)
males_depth_rddm = fread("raw/depth/males_flowers_rddm.depth")
males_depth_rddm = surco(males_depth_rddm, 13224600, 23943681, 23582102)
# plot circos
{
png(file="results/srna_depth.png", width=2000, height=2000)
circos.par("clock.wise"=TRUE, start.degree=86, gap.degree=c(rep(1,18), 8))
circos.genomicInitialize(chr_sizes, plotType = c("axis"), axis.labels.cex=2)
circos.track(ylim = c(0, 0.05), bg.border = NA, bg.col = c(viridis::inferno(11, begin=0.15, end=0.9, direction = -1), "gray", rep("black", 8)) , track.height = 0.05)
circos.genomicDensity(gene_bed, col = c("#ee4d80"), track.height = 0.1, count_by = "number")
circos.genomicDensity(tes_bed, col = c("#b52ac6"), track.height = 0.1, count_by = "number")
circos.genomicHeatmap(males_depth_ptgs, na_col= "white", numeric.column="log_depth", heatmap_height = 0.05, connection_height=NULL, col= colorRamp2(c(-4, 5), c("white", "#2f8d45")))
circos.genomicHeatmap(males_depth_rddm, na_col= "white", numeric.column="log_depth", heatmap_height = 0.05, connection_height=NULL, col= colorRamp2(c(-4, 5), c("white", "#4453b1")))
circos.genomicHeatmap(females_depth_ptgs, na_col= "white", numeric.column="log_depth", heatmap_height = 0.05, connection_height=NULL, col= colorRamp2(c(-4, 5), c("white", "#2f8d45")))
circos.genomicHeatmap(females_depth_rddm, na_col= "white", numeric.column="log_depth", heatmap_height = 0.05, connection_height=NULL, col= colorRamp2(c(-4, 5), c("white", "#4453b1")))
circos.clear()
dev.off()
}
# Distribution of inverted repeats
inv_bed = read.table("results/inv_rep.bed"
{
png(file="results/inv_rep.png", width=2000, height=2000)
circos.par("clock.wise"=TRUE, start.degree=86, gap.degree=c(rep(1,18), 8))
circos.genomicInitialize(chr_sizes, plotType = c("axis"), axis.labels.cex=2)
circos.track(ylim = c(0, 0.05), bg.border = NA, bg.col = c(viridis::inferno(11, begin=0.15, end=0.9, direction = -1), "gray", rep("black", 8)) , track.height = 0.05)
circos.genomicDensity(gene_bed, col = c("#ee4d80"), track.height = 0.1, count_by = "number")
circos.genomicDensity(tes_bed, col = c("#b52ac6"), track.height = 0.1, count_by = "percent")
circos.genomicDensity(inv_bed, col = c("blue"), track.height = 0.1, count_by = "number")
circos.clear()
dev.off()
}
### All together
exp_ptgs = read.table("raw/only_21-22_known_de_novo_f-y/Counts.txt", header = T)
transposed = t(cpm(exp_ptgs[, 4:15], log = T))
colnames(transposed) = exp_ptgs$Name
samples = data.frame(samples = rownames(transposed),
tissue = c(rep(c("Female, Flower bud", "Female, Leaf"), 3), rep(c("Male, Flower bud", "Male, Leaf"), 3)),
sex = c(rep("Female", 6), rep("Male", 6)),
organ = rep(c("Flower bud", "Leaf"), 6)
)
pca_exp = PCA(transposed, ncp = 20, graph = F)
pcadf = data.frame(samples,
pca1 = as.data.frame(pca_exp$ind)[,1],
pca2 = as.data.frame(pca_exp$ind)[,2])
ggplot(pcadf) +
geom_hline(yintercept = 0, color="gray40")+
geom_vline(xintercept = 0, color="gray40")+
geom_jitter(aes(pca1, pca2, fill=tissue), shape=21, size=8) +
labs(x="Component 1 (35.67%)", y= "Component 2 (13.39%)", fill= "Tissue") +
scale_fill_manual(values = c("#3ab98f", "#2ab1c6", "#ee4d80", "#b52ac6")) +
guides(fill = guide_legend(override.aes = list(size=8))) +
coord_fixed(ratio = 1.5) +
theme_bw(base_size = 20) +
theme(legend.position = c(0.8, 0.7),
legend.text = element_text(size = 10),
legend.title = element_text(size = 10),
legend.background = element_rect(fill = "gray"),
aspect.ratio=1)
ggsave("results/pca.png", width = 8, height = 6, dpi = 600)
rm(pcadf, pca_exp)
# sRNA clusters names will be kept as row names, remove columns with ids
matrix_ptgs = exp_ptgs[, 4:15]
row.names(matrix_ptgs) = exp_ptgs$Name
# discard low counts (<0.5 RPM) entries for at least 2 samples
keep = rowSums(cpm(matrix_ptgs)>0.5) >= 2
summary(keep) # 132 will be removed
## Mode FALSE TRUE
## logical 132 45260
matrix_ptgs = matrix_ptgs[keep,]
rm(keep)
# Prepare data info
datainfo = samples[, 2:4]
row.names(datainfo) = samples$samples # colnames in the expression matrix must match
pyramide = function(raw_matrix, data_info, design_formula, groups_vector){
######### DESEQ 2
# Generate the DESeqDataSet
deseq_object = DESeqDataSetFromMatrix(countData = raw_matrix, colData = data_info, design = design_formula)
# Run Analysis
deseq_object = DESeq(deseq_object)
# Get table
deseq_table = as.data.frame(results(deseq_object, pAdjustMethod = "BH"))
# Get DE clusters that fall under the selection threshold (pvalue < 0.001)
deseq = rownames(filter(deseq_table, pvalue<0.001 & {log2FoldChange>1 | log2FoldChange<(-1)} ) )
######## EDGER
# generate DGEList with grouping by species
edgeR.DGElist <- DGEList(counts = raw_matrix, group = groups_vector)
# normalize for RNA composition, scaling for the effective library size
edgeR.DGElist <- calcNormFactors(edgeR.DGElist)
# Estimation of dispersion
edgeR.DGElist <- estimateCommonDisp(edgeR.DGElist, verbose=T)
edgeR.DGElist <- estimateTagwiseDisp(edgeR.DGElist)
# ESTIMATION OF DISPERTION by GLM
design.mat <- model.matrix(~ groups_vector)
# perform DE testing
fit <- glmFit(edgeR.DGElist, design.mat)
lrt <- glmLRT(fit , coef = 2)
# extract results table
edger_table = topTags(lrt , n = Inf, sort.by = "PValue", adjust.method = "BH")$table
# Extract most differential expressed genes
edger = rownames(filter(edger_table, PValue<0.001 & {logFC>1 | logFC<(-1)} ))
######### LIMA VOOM
# Redefine object
LimmaVoom.DGElist <- edgeR.DGElist
# normalize for RNA composition, scaling for the effective library size
LimmaVoom.DGElist <- calcNormFactors(LimmaVoom.DGElist)
rownames(design.mat) <- colnames(LimmaVoom.DGElist) # add names to the design matrix from edgeR
# Transform count data to log2-counts per million (logCPM), estimate the mean-variance relationship and
# use this to compute appropriate observational-level weights. The data are then ready for linear modelling
voomTransformed <- voom(LimmaVoom.DGElist , design.mat , plot=F)
# fit a linear model for each gene
voomed.fitted <- lmFit(voomTransformed , design = design.mat)
# compute statistics
voomed.fitted <- eBayes(voomed.fitted)
# extract most differential expressed genes and table
lvoom_table = topTable(voomed.fitted, adjust="BH", number = Inf, sort.by="P")
lvoom = rownames(filter(lvoom_table, P.Value<0.001 & {logFC>1 | logFC<(-1)} ))
######## Get the clusters that were diferentially expressed in at least 2 methods
clusters = unique(c(intersect(deseq, edger), intersect(deseq, lvoom), intersect(edger, lvoom)))
####### Print results
list("clusters" = clusters,
"DeSeq2 table" = deseq_table,
"EdgeR table" = edger_table,
"Lima-Voom table" = lvoom_table)
}
# Select samples
fptgs = matrix_ptgs %>% select(grep('B', colnames(matrix_ptgs)))
fptgs_info = datainfo %>% filter(organ=="Flower bud")
# define groups (in the same order as the table)
fgroups = datainfo[datainfo$organ == "Flower bud", ]$sex
flowers = pyramide(fptgs, fptgs_info, ~sex, fgroups)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## Disp = 0.72015 , BCV = 0.8486
## Removing intercept from test coefficients
rm(fptgs, fptgs_info, fgroups)
# Select samples
lptgs = matrix_ptgs %>% select(grep('L', colnames(matrix_ptgs)))
lptgs_info = datainfo %>% filter(organ=="Leaf")
# define groups (in the same order as the table)
lgroups = datainfo[datainfo$organ == "Leaf", ]$sex
leaves = pyramide(lptgs, lptgs_info, ~sex, lgroups)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## Disp = 0.7999 , BCV = 0.8944
## Removing intercept from test coefficients
rm(lptgs, lptgs_info, lgroups)
# Select samples
feptgs = matrix_ptgs %>% select(grep('F', colnames(matrix_ptgs)))
feptgs_info = datainfo %>% filter(sex=="Female")
# define groups (in the same order as the table)
fegroups = datainfo[datainfo$sex == "Female", ]$organ
female = pyramide(feptgs, feptgs_info, ~organ, fegroups)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## estimating size factors
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## final dispersion estimates
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## fitting model and testing
## Disp = 0.66011 , BCV = 0.8125
## Removing intercept from test coefficients
rm(feptgs, feptgs_info, fegroups)
# Select samples
maptgs = matrix_ptgs %>% select(grep('M', colnames(matrix_ptgs)))
maptgs_info = datainfo %>% filter(sex=="Male")
# define groups (in the same order as the table)
magroups = datainfo[datainfo$sex == "Male", ]$organ
male = pyramide(maptgs, maptgs_info, ~organ, magroups)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## estimating size factors
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## final dispersion estimates
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## fitting model and testing
## Disp = 0.85829 , BCV = 0.9264
## Removing intercept from test coefficients
rm(maptgs, maptgs_info, magroups)
ptgs_list = list("Sex-biased in flower buds" = flowers[["clusters"]],
"Sex-biased in leaves" = leaves[["clusters"]],
"Tissue-biased in females" = female[["clusters"]],
"Tissue-biased in males" = male[["clusters"]])
upset(fromList(ptgs_list),
nintersects = 8,
mainbar.y.label = "Shared DE sRNA clusters",
sets.x.label = "DE sRNA clusters",
matrix.color = met.brewer("Egypt")[2],
main.bar.color = met.brewer("Egypt")[3],
sets.bar.color = met.brewer("Egypt")[4],
shade.color = met.brewer("Egypt")[2],
point.size = 10,
line.size = 4,
text.scale = c(3, 4, 3, 4, 5, 4),
mb.ratio = c(0.5, 0.5)
)
rm(ptgs_list)
# Sex-biased in flowers
baits_flowers = flowers[["clusters"]]
baits_flowers = as.data.frame(cpm(matrix_ptgs, log=TRUE)[baits_flowers, ] ) %>% select(grep('B', colnames(matrix_ptgs)))
baits_flowers = baits_flowers %>% mutate(cluster = rownames(baits_flowers)) %>% group_by(cluster) %>% summarise(female_exp = mean(c(F1B_dicer, F2B_dicer, F3B_dicer)),
male_exp = mean(c(M1B_dicer, M2B_dicer, M4B_dicer)),
male_biased = male_exp>female_exp,
type = "Sex-biased sRNAs in flowers") %>% add_count(type)
# Sex-biased in leaves
baits_leaves = leaves[["clusters"]]
baits_leaves = as.data.frame(cpm(matrix_ptgs, log=TRUE)[baits_leaves, ] ) %>% select(grep('L', colnames(matrix_ptgs)))
baits_leaves = baits_leaves %>% mutate(cluster = rownames(baits_leaves)) %>% group_by(cluster) %>% summarise(female_exp = mean(c(F1L_dicer, F2L_dicer, F3L_dicer)),
male_exp = mean(c(M1L_dicer, M2L_dicer, M4L_dicer)),
male_biased = male_exp>female_exp,
type = "Sex-biased sRNAs in leaves") %>% add_count(type)
bait_df = rbind(baits_flowers, baits_leaves)
# Get mir annotation
anno_ptgs = read.table("raw/only_21-22_known_de_novo_f-y/Results.txt", header = T)
mirnas_ptgs = anno_ptgs[, c("Name", "MIRNA")]
# Merge annotation
bait_df = bait_df %>% left_join(mirnas_ptgs, by = c("cluster"="Name"))
bait_df = bait_df %>% arrange(MIRNA)
levels(as.factor(bait_df$MIRNA)) # 3 miRNA
## [1] "N" "Y"
write.table(bait_df, file ="results/DEsRNAClusters.tsv", quote = F, col.names = F, sep = "\t", row.names = F)
# Plot all
ggplot(bait_df) +
annotate(geom = "polygon", x = c(-Inf, Inf, Inf), y = c(-Inf, Inf, -Inf), fill = "gray8", alpha = 0.2 ) +
geom_abline(intercept = 0, slope = 1, alpha=0.6, color="red2", size=1, linetype="dashed") +
geom_jitter(aes(female_exp, male_exp, fill=male_biased, shape=MIRNA, color=MIRNA), size=4) +
geom_text(aes(x = -Inf, y = -Inf, label= paste0("n = ", n)), hjust = -.5, vjust = -28) +
scale_x_continuous(limits = c(-2.5, 10)) +
scale_y_continuous(limits = c(-2.5, 10)) +
labs(x= "♀ logRPM", y="♂ logRPM") +
scale_fill_manual(values = met.brewer("Derain")) +
scale_shape_manual(values = c(21, 19)) +
scale_color_manual(values = c("black", "#2471a3")) +
theme_bw() +
theme(legend.position = "none",
text = element_text(size = 20),
panel.border = element_rect(color = "black", fill=NA),
aspect.ratio=1) +
facet_grid(~type)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 1908 rows containing missing values (`geom_point()`).
ggsave("results/desrnas.png", width = 8, height = 8, dpi = 600)
## Warning: Removed 1908 rows containing missing values (`geom_point()`).
rm(baits_flowers, baits_leaves, mirnas_ptgs)
# clean and transform data
kall = read.table(file = "rna-seq/kallisto/raw_counts.tsv", header = T)
rownames(kall) = kall$target_id
kall = kall[, -1]
# Round counts cause Kallisto returns decimals after the bootstrapping
kall = round(kall)
# discard low counts (<0.5 RPM) entries for at least 2 samples
keep = rowSums(cpm(kall)>0.5) >= 2
summary(keep) # 590 genes will be removed
## Mode FALSE TRUE
## logical 590 12481
kall = kall[keep,]
rm(keep)
# Prepare data info
# Four female plants (C1_26, C1_27, C1_29, C1_34) and four males (C1_01, C1_03, C1_04, C1_05) were sequenced for flower buds ("_B_" in names) and leaves ("_L_" in names).
kall_info = data.frame(sex = c(rep("Male", 8), rep("Female", 8)),
organ = rep(c("Flower bud", "Leaf"), 8),
row.names = colnames(kall))
# check all samples are included and in the same order
all(colnames(kall) %in% rownames(kall_info))
## [1] TRUE
all(colnames(kall) == rownames(kall_info))
## [1] TRUE
# Read TPM data and transform
tpm = read.table("rna-seq/kallisto/tpm.tsv", header = T)
rownames(tpm) = tpm$target_id
tpm = tpm[, -1]
# transform data
transposed = t(tpm)
pca_exp = PCA(transposed, ncp = 20, graph = F)
pcadf = data.frame(tissue = paste0(kall_info$sex, ", ", kall_info$organ),
pca1 = as.data.frame(pca_exp$ind)[,1],
pca2 = as.data.frame(pca_exp$ind)[,2])
ggplot(pcadf) +
geom_hline(yintercept = 0, color="gray40")+
geom_vline(xintercept = 0, color="gray40")+
geom_jitter(aes(pca1, pca2, fill=tissue), shape=21, size=8) +
labs(x="Component 1 (35.67%)", y= "Component 2 (13.39%)", fill= "Tissue") +
scale_fill_manual(values = c("#3ab98f", "#2ab1c6", "#ee4d80", "#b52ac6")) +
guides(fill = guide_legend(override.aes = list(size=8))) +
coord_fixed(ratio = 1.5) +
theme_bw(base_size = 20) +
theme(legend.position = c(0.25, 0.8),
legend.text = element_text(size = 10),
legend.title = element_text(size = 10),
legend.background = element_rect(fill = "gray"),
aspect.ratio=1)
ggsave("results/pca_sbge.png", width = 8, height = 6, dpi = 600)
rm(pcadf, pca_exp)
# Select flower bud samples
fsbge = kall %>% select(grep('B', colnames(kall)))
fsbge_info = kall_info %>% filter(organ=="Flower bud")
# define groups (in the same order as the table)
fgroups = kall_info[kall_info$organ == "Flower bud", ]$sex
flowers_genes = pyramide(fsbge, fsbge_info, ~sex, fgroups)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## Disp = 0.06511 , BCV = 0.2552
## Removing intercept from test coefficients
rm(fsbge, fsbge_info, fgroups)
# Select leaf samples
lsbge = kall %>% select(grep('L', colnames(kall)))
lsbge_info = kall_info %>% filter(organ=="Leaf")
# define groups (in the same order as the table)
lgroups = kall_info[kall_info$organ == "Leaf", ]$sex
leaves_genes = pyramide(lsbge, lsbge_info, ~sex, lgroups)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## Disp = 0.16956 , BCV = 0.4118
## Removing intercept from test coefficients
rm(lsbge, lsbge_info, lgroups)
# Select female samples
fet_sbge = kall %>% select(rownames(kall_info[kall_info$sex=="Female", ]))
fet_sbge_info = kall_info %>% filter(sex=="Female")
# define groups (in the same order as the table)
fegroups = kall_info[kall_info$sex == "Female", ]$organ
female_genes = pyramide(fet_sbge, fet_sbge_info, ~organ, fegroups)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## estimating size factors
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## final dispersion estimates
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## fitting model and testing
## Disp = 0.13265 , BCV = 0.3642
## Removing intercept from test coefficients
rm(fet_sbge, fet_sbge_info, fegroups)
# Select male samples
mt_sbge = kall %>% select(rownames(kall_info[kall_info$sex=="Male", ]))
mt_sbge_info = kall_info %>% filter(sex=="Male")
# define groups (in the same order as the table)
mgroups = kall_info[kall_info$sex == "Male", ]$organ
male_genes = pyramide(mt_sbge, mt_sbge_info, ~organ, mgroups)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## estimating size factors
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## final dispersion estimates
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## fitting model and testing
## Disp = 0.1018 , BCV = 0.3191
## Removing intercept from test coefficients
rm(mt_sbge, mt_sbge_info, mgroups)
sbge_list = list("Sex-biased in flower buds" = flowers_genes[["clusters"]],
"Sex-biased in leaves" = leaves_genes[["clusters"]],
"Tissue-biased in females" = female_genes[["clusters"]],
"Tissue-biased in males" = male_genes[["clusters"]])
upset(fromList(sbge_list),
nintersects = 8,
mainbar.y.label = "Shared DE genes",
sets.x.label = "DE genes",
matrix.color = met.brewer("Egypt")[2],
main.bar.color = met.brewer("Egypt")[3],
sets.bar.color = met.brewer("Egypt")[4],
shade.color = met.brewer("Egypt")[2],
point.size = 10,
line.size = 4,
text.scale = c(3, 4, 3, 4, 5, 4),
mb.ratio = c(0.6, 0.4))
rm(sbge_list)
# Sex-biased in flowers
baits_flowers = flowers_genes[["clusters"]]
baits_flowers = as.data.frame(tpm[baits_flowers, ]) %>% select(grep('B', colnames(tpm)))
baits_flowers = baits_flowers %>% mutate(gene = rownames(baits_flowers)) %>% group_by(gene) %>% summarise(female_exp = mean(c(C1_26_B, C1_27_B, C1_29_B_combined, C1_34_B_combined)),
male_exp = mean(c(C1_01_B, C1_03_B, C1_04_B_combined, C1_05_B_combined)),
male_biased = male_exp>female_exp,
type = "Sex-biased genes in flowers") %>% add_count(type)
# Sex-biased in leaves
baits_leaves = leaves_genes[["clusters"]]
baits_leaves = as.data.frame(tpm[baits_leaves, ]) %>% select(grep('L', colnames(tpm)))
baits_leaves = baits_leaves %>% mutate(gene = rownames(baits_leaves)) %>% group_by(gene) %>% summarise(female_exp = mean(c(C1_26_L, C1_27_L, C1_29_L, C1_34_L)),
male_exp = mean(c(C1_01_L, C1_03_L, C1_04_L, C1_05_L)),
male_biased = male_exp>female_exp,
type = "Sex-biased genes in leaves") %>% add_count(type)
# Merge
bait_df = rbind(baits_flowers, baits_leaves)
# Plot all
ggplot(bait_df) +
annotate(geom = "polygon", x = c(-Inf, Inf, Inf), y = c(-Inf, Inf, -Inf), fill = "gray8", alpha = 0.2 ) +
geom_abline(intercept = 0, slope = 1, alpha=0.6, color="red2", size=1, linetype="dashed") +
geom_jitter(aes(female_exp, male_exp, fill=male_biased), shape=21, size=4) +
geom_text(aes(x = -Inf, y = -Inf, label= paste0("n = ", n)), hjust = -3, vjust = -28) +
scale_x_continuous(limits = c(-2.5, 10)) +
scale_y_continuous(limits = c(-2.5, 10)) +
labs(x= "♀ TPM", y="♂ TPM") +
scale_fill_manual(values = met.brewer("Derain")) +
scale_y_continuous(limits = c(0,10)) +
scale_x_continuous(limits = c(0,10)) +
theme_bw() +
theme(legend.position = "none",
text = element_text(size = 20),
panel.border = element_rect(color = "black", fill=NA),
aspect.ratio=1) +
facet_grid(~type)
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Warning: Removed 1332 rows containing missing values (`geom_point()`).
ggsave("results/sbge.png", width = 8, height = 8, dpi = 600)
## Warning: Removed 1329 rows containing missing values (`geom_point()`).
rm(baits_leaves, baits_flowers)
# formatting of the data
sRNA_clusters = as.data.table(read.table("raw/only_21-22_known_de_novo_f-y/Results.txt", header = T)) # transform to data table
sRNA_clusters[, min := min(Start, End), by=1:nrow(sRNA_clusters)] # unstrand start
sRNA_clusters[, max := max(Start, End), by=1:nrow(sRNA_clusters)] # unstrand end
# get gene annotations, already including +-400 bp
genesCoordinates = fread("annotation/full_genes.bed")
colnames(genesCoordinates) = c("chr", "start", "end", "gene_name", "score", "strand")
genesCoordinates[, min := min(start, end), by=1:nrow(genesCoordinates)]
genesCoordinates[, max := max(start, end), by=1:nrow(genesCoordinates)]
genesCoordinates = genesCoordinates[!duplicated(genesCoordinates$gene_name),] # remove duplicates
# select for sex-DE of genes/sRNAs in Flowers
sRNA_clusters_bait = sRNA_clusters[Name %in% flowers[["clusters"]]] # Select DE sNRAs
genesCoordinates_bait = genesCoordinates[gene_name %in% flowers_genes[["clusters"]]] # select DE genes
# Initiate data.table containing the header
header = list('gene_name', 'overlapping_cluster', 'comparison')
fwrite(header, 'results/overlaps/overlapping_clusters_ptgs.txt', append = FALSE, sep='\t')
# For each gene, get all overlapping siRNA clusters
for (i in seq(1, nrow(genesCoordinates_bait))) {
# Current gene info
CurrentGene <- genesCoordinates_bait$gene_name[i]
CurrentChr <- genesCoordinates_bait$chr[i]
Currentmin <- genesCoordinates_bait$min[i]
Currentmax <- genesCoordinates_bait$max[i]
# make a subset of the siRNA cluster data table for clusters overlapping the gene
# overlap happens if cluster minimum is lower than gene maximum AND cluster maximum is higher than gene minimum
siRNAsubset <- sRNA_clusters_bait[(Chrom == CurrentChr)&(min < Currentmax)&(max > Currentmin)]
# write a line with output and append it to output data.table
# line <- list(CurrentGene, nrow(siRNAsubset))
line = as.data.frame(siRNAsubset$Name) %>% mutate(CurrentGene, "sex_biased_in_flowers") %>% relocate(CurrentGene)
fwrite(line, 'results/overlaps/overlapping_clusters_ptgs.txt', append = TRUE, sep='\t')
}
###########################################################################################################################
# select for sex-DE of genes/sRNAs in leaves
sRNA_clusters_bait = sRNA_clusters[Name %in% leaves[["clusters"]]] # Select DE sNRAs
genesCoordinates_bait = genesCoordinates[gene_name %in% leaves_genes[["clusters"]]] # select DE genes
# For each gene, get all overlapping siRNA clusters
for (i in seq(1, nrow(genesCoordinates_bait))) {
# Current gene info
CurrentGene <- genesCoordinates_bait$gene_name[i]
CurrentChr <- genesCoordinates_bait$chr[i]
Currentmin <- genesCoordinates_bait$min[i]
Currentmax <- genesCoordinates_bait$max[i]
# make a subset of the siRNA cluster data table for clusters overlapping the gene
# overlap happens if cluster minimum is lower than gene maximum AND cluster maximum is higher than gene minimum
siRNAsubset <- sRNA_clusters_bait[(Chrom == CurrentChr)&(min < Currentmax)&(max > Currentmin)]
# write a line with output and append it to output data.table
# line <- list(CurrentGene, nrow(siRNAsubset))
line = as.data.frame(siRNAsubset$Name) %>% mutate(CurrentGene, "sex_biased_in_leaves") %>% relocate(CurrentGene)
fwrite(line, 'results/overlaps/overlapping_clusters_ptgs.txt', append = TRUE, sep='\t')
}
# transform and add info to sRNA quantification
transposed = transpose(matrix_ptgs)
colnames(transposed) = rownames(matrix_ptgs)
rownames(transposed) = colnames(matrix_ptgs)
transposed = cpm(transposed, log = T) # transform to logRPM
# add samples as column
transposed = as.data.frame(transposed) %>% mutate(sample = rownames(transposed)) %>% relocate(sample)
datainfo_s = datainfo %>% mutate(sample = rownames(datainfo)) %>% relocate(sample)
# combine
fermented_srna = right_join(datainfo_s, transposed, by = "sample")
# transform and add info to gene quant
fermented_genes = transpose(tpm)
colnames(fermented_genes) = rownames(tpm)
rownames(fermented_genes) = colnames(tpm)
fermented_genes = as.data.frame(fermented_genes) %>% mutate(sample = rownames(fermented_genes)) %>% relocate(sample)
datainfo_s = kall_info %>% mutate(sample = rownames(kall_info)) %>% relocate(sample)
fermented_genes = right_join(datainfo_s, fermented_genes, by = "sample")
# Get overlapping clusters
over_flowers = read_table("results/overlaps/flowers.tsv", col_names = F)
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## X1 = col_character(),
## X2 = col_character()
## )
colnames(over_flowers) = c("gene_name", "cluster")
over_flowers = filter(over_flowers, gene_name=="Silat_chr12_Gene.1440")
# leaves
over_flowers = read_table("results/overlaps/leaves.tsv", col_names = F)
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## X1 = col_character(),
## X2 = col_character()
## )
colnames(over_flowers) = c("gene_name", "cluster")
over_flowers = filter(over_flowers, gene_name=="Silat_scaffold_13_Gene.43660", cluster=="Cluster_45289") # TE with precursor
# over_flowers = filter(over_flowers, gene_name=="Silat_chr12_Gene.3592") # with overlap
# experiment 1, plot
fermented_gene_bait = subset(fermented_genes, select = c("sample", "sex", "organ", levels(as.factor(over_flowers$gene_name)))) %>% mutate(element = "gene")
fermented_gene_bait = melt(fermented_gene_bait, id.vars = c("sample", "sex", "organ", "element"), variable.name = "name", value.name = "expression")
fermented_srna_bait = subset(fermented_srna, select = c("sample", "sex", "organ", levels(as.factor(over_flowers$cluster)))) %>% mutate(element = "srna")
fermented_srna_bait = melt(fermented_srna_bait, id.vars = c("sample", "sex", "organ", "element"), variable.name = "name", value.name = "expression")
dough = rbind(fermented_gene_bait, fermented_srna_bait)
dough %>% group_by(name, sex, organ) %>% summarise(mean = mean(expression), sex, name, element, organ) %>% unique() %>%
ggplot(aes(x=sex)) +
facet_wrap(~organ, nrow = 1) +
geom_point(aes(y= mean, color = name), size=4) +
geom_line(aes(y= mean, group = name, color=name), size=2) +
scale_color_manual(values = met.brewer("Egypt")) +
labs(x=NULL, y="Mean Expression", color=NULL) +
theme(legend.position = "top",
text = element_text(size = 20),
panel.border = element_rect(color = "black", fill=NA),
aspect.ratio=1)
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
## always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'name', 'sex', 'organ'. You can override
## using the `.groups` argument.
#ggsave("results/bait_exploration.png", width = 8, height = 8, dpi = 600)
#ggsave("results/flower_candidate_precursor.png", width = 8, height = 8, dpi = 600)
ggsave("results/leaves_candidate_precursor.png", width = 8, height = 8, dpi = 600)
# get chr boundaries again
chr_sizes = read.table("chromSizes.txt")
colnames(chr_sizes) = c("chr", "end")
chr_sizes = chr_sizes %>% mutate(start = "1") %>% relocate(start, .before = end)
# load gene bed
gene_bed = read.table("annotation/genes.bed")
colnames(gene_bed) = c("chr", "start", "end", "gene", "score", "strand")
# Distribution of inverted repeats and TE annotation
#inv_bed = read.table("annotation/inv_rep.bed")
#tes_bed = read.table("annotation/tes.bed", sep = "\t")
# load candidate info in flowers
cand_flowers = read.table("results/overlaps/flowers_precursos.tsv")
colnames(cand_flowers) = c("inv_rep", "chr", "start", "end", "sRNA", "gene")
# merge info
cand_flowers = left_join(cand_flowers, gene_bed, by="gene")
# plot
{
png(file="results/candidate_flowers.png", width=1000, height=1000)
circos.par("clock.wise"=TRUE, start.degree=86, gap.degree=c(rep(1,18), 8))
circos.genomicInitialize(chr_sizes, plotType = c("axis"), axis.labels.cex=1)
circos.track(ylim = c(0, 0.05), bg.border = NA, bg.col = c(viridis::inferno(11, begin=0.15, end=0.9, direction = -1), "gray", rep("black", 8)) , track.height = 0.05)
#circos.genomicDensity(gene_bed, col = c("#ee4d80"), track.height = 0.1, count_by = "number")
#circos.genomicDensity(tes_bed, col = c("#b52ac6"), track.height = 0.1, count_by = "number")
#circos.genomicDensity(inv_bed, col = "blue", track.height = 0.1, count_by = "number")
circos.genomicLink(region1 = cand_flowers[, c("chr.x", "start.x", "end.x")], region2 = cand_flowers[, c("chr.y", "start.y", "end.y")], col = "#5EBF75", lwd=6, direction=1, arr.width=0.5)
circos.clear()
dev.off()
}
## png
## 2
# load candidate info in leaves
cand_leaves = read.table("results/overlaps/leaves_precursos.tsv")
colnames(cand_leaves) = c("inv_rep", "chr", "start", "end", "sRNA", "gene")
# merge gene info
cand_leaves = left_join(cand_leaves, gene_bed, by="gene")
# plot
{
png(file="results/candidate_leaves.png", width=2000, height=2000)
circos.par("clock.wise"=TRUE, start.degree=86, gap.degree=c(rep(1,18), 8))
circos.genomicInitialize(chr_sizes, plotType = c("axis"), axis.labels.cex=2)
circos.track(ylim = c(0, 0.05), bg.border = NA, bg.col = c(viridis::inferno(11, begin=0.15, end=0.9, direction = -1), "gray", rep("black", 8)) , track.height = 0.05)
#circos.genomicDensity(gene_bed, col = c("#ee4d80"), track.height = 0.1, count_by = "number")
#circos.genomicDensity(tes_bed, col = c("#b52ac6"), track.height = 0.1, count_by = "number")
#circos.genomicDensity(inv_bed, col = "blue", track.height = 0.1, count_by = "number")
circos.genomicLink(region1 = cand_leaves[, c("chr.x", "start.x", "end.x")], region2 = cand_leaves[, c("chr.y", "start.y", "end.y")], col = "#5EBF75", lwd=6, direction=1, arr.width=1)
circos.clear()
dev.off()
}
## png
## 2
rm(inv_bed, tes_bed, cand_flowers, cand_leaves, gene_bed)
## Warning in rm(inv_bed, tes_bed, cand_flowers, cand_leaves, gene_bed): object
## 'inv_bed' not found
## Warning in rm(inv_bed, tes_bed, cand_flowers, cand_leaves, gene_bed): object
## 'tes_bed' not found
# sex-biased PTGS interaction
chi_flower = matrix(c(c(10, 758),
c(32, 1764)),
nrow = 2, byrow = T)
colnames(chi_flower) = c("number of overlaps to sex-biased clusters", "number of overlaps to unbiased clusters")
rownames(chi_flower) = c("sex-biased genes", "unbiased genes")
chi_leaves = matrix(c(c(3, 38),
c(27, 2220)),
nrow = 2, byrow = T)
colnames(chi_leaves) = c("number of overlaps to sex-biased clusters", "number of overlaps to unbiased clusters")
rownames(chi_leaves) = c("sex-biased genes", "unbiased genes")
# Perform chi-square test
chisq.test(chi_flower) #the observed values don't significantly differ from the expected values
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: chi_flower
## X-squared = 0.4993, df = 1, p-value = 0.4798
chisq.test(chi_leaves) #the observed values significantly differ from the expected values
## Warning in chisq.test(chi_leaves): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: chi_leaves
## X-squared = 7.3912, df = 1, p-value = 0.006554
rm(chi_flower, chi_leaves)
# We calculate depths for each sex-biased gene in Flowers
surco_gene = function(toy, lib_size_1, lib_size_2, lib_size_3) {
# set column names
colnames(toy) = c("chr", "start", "end", "depth_1", "depth_2", "depth_3", "gene")
# calculate mead depth per window, normalize by library size
toy[, mean_depth_1 := mean(depth_1)/lib_size_1, by=.(gene)]
toy[, mean_depth_2 := mean(depth_2)/lib_size_2, by=.(gene)]
toy[, mean_depth_3 := mean(depth_3)/lib_size_3, by=.(gene)]
# get one row per window
toy = unique(toy, by = "gene")[, .(gene, mean_depth_1, mean_depth_2, mean_depth_3)]
# overall mean and multiply by 1m
toy[, mean_depth := (mean_depth_1 + mean_depth_2 + mean_depth_3)*(1000000/3), by=1:nrow(toy)]
toy[, log_depth := log2(mean_depth), by=1:nrow(toy)]
# print bed file + mean depth per million
toy[, .(gene, log_depth, mean_depth)]
}
# Females
flo_sbg_fe_d = fread("results/gene-level/flowers_depth_females.tsv")
flo_sbg_fe_d = surco_gene(flo_sbg_fe_d, 54935002, 29689313, 25245689)
colnames(flo_sbg_fe_d) = c("gene", "log_female_exp", "female_exp")
# Males
flo_sbg_ma_d = fread("results/gene-level/flowers_depth_males.tsv")
flo_sbg_ma_d = surco_gene(flo_sbg_ma_d, 13224600, 23943681, 23582102)
colnames(flo_sbg_ma_d) = c("gene", "log_male_exp", "male_exp")
flo_depth = full_join(flo_sbg_fe_d, flo_sbg_ma_d, by = "gene")
flo_depth$log_female_exp[is.na(flo_depth$log_female_exp)] = -5 # replace NA for "0" mapping
flo_depth$log_male_exp[is.na(flo_depth$log_male_exp)] = -5
# Plot
ggplot(flo_depth) +
annotate(geom = "polygon", x = c(-Inf, Inf, Inf), y = c(-Inf, Inf, -Inf), fill = "gray8", alpha = 0.2 ) +
geom_abline(intercept = 0, slope = 1, alpha=0.6, color="red2", size=1, linetype="dashed") +
geom_jitter(aes(log_female_exp, log_male_exp), color=met.brewer("Derain", 1), size=4) +
scale_x_continuous(limits = c(-5, 4)) +
scale_y_continuous(limits = c(-5, 4)) +
labs(x= "♀ log2RPM sRNA", y="♂ log2RPM sRNA") +
scale_fill_manual(values = met.brewer("Derain")) +
theme_bw() +
theme(legend.position = "none",
text = element_text(size = 20),
panel.border = element_rect(color = "black", fill=NA),
aspect.ratio=1)
## Warning: Removed 909 rows containing missing values (`geom_point()`).
ggsave("results/gene-level_mapping.png", width = 8, height = 6, dpi = 600)
## Warning: Removed 913 rows containing missing values (`geom_point()`).
rm(flo_depth, flo_sbg_fe_d, flo_sbg_ma_d)
exp_rddm = read.table("raw/only_24_f-y/Counts.txt", header = T)
# sRNA clusters names will be kept as row names, remove columns with ids
matrix_rddm = exp_rddm[, 4:15]
row.names(matrix_rddm) = paste(exp_rddm$Name, "_RdDM", sep = "") # so we can differentiate from PTGS
# discard low counts (<0.5 RPM) entries for at least 2 samples
keep = rowSums(cpm(matrix_rddm)>0.5) >= 2
summary(keep) # 132 will be removed
## Mode FALSE TRUE
## logical 132 45256
matrix_rddm = matrix_rddm[keep,]
rm(keep)
# we take the data info from PTGS, assuming it is in the environment
### All together
transposed = t(cpm(exp_rddm[, 4:15], log = T))
colnames(transposed) = exp_rddm$Name
pca_exp = PCA(transposed, ncp = 20, graph = F)
pcadf = data.frame(samples,
pca1 = as.data.frame(pca_exp$ind)[,1],
pca2 = as.data.frame(pca_exp$ind)[,2])
ggplot(pcadf) +
geom_hline(yintercept = 0, color="gray40")+
geom_vline(xintercept = 0, color="gray40")+
geom_jitter(aes(pca1, pca2, fill=tissue), shape=21, size=8) +
labs(x="Component 1 (35.67%)", y= "Component 2 (13.39%)", fill= "Tissue") +
scale_fill_manual(values = c("#3ab98f", "#2ab1c6", "#ee4d80", "#b52ac6")) +
guides(fill = guide_legend(override.aes = list(size=8))) +
coord_fixed(ratio = 1.5) +
theme_bw(base_size = 20) +
theme(legend.position = c(0.8, 0.7),
legend.text = element_text(size = 10),
legend.title = element_text(size = 10),
legend.background = element_rect(fill = "gray"),
aspect.ratio=1)
ggsave("results/pca_rddm.png", width = 8, height = 6, dpi = 600)
rm(pcadf, pca_exp)
# Select samples
frddm = matrix_rddm %>% select(grep('B', colnames(matrix_rddm)))
frddm_info = datainfo %>% filter(organ=="Flower bud")
# define groups (in the same order as the table)
fgroups = datainfo[datainfo$organ == "Flower bud", ]$sex
flowers_rddm = pyramide(frddm, frddm_info, ~sex, fgroups)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## Disp = 0.72015 , BCV = 0.8486
## Removing intercept from test coefficients
rm(frddm, frddm_info, fgroups)
# Select samples
lrddm = matrix_rddm %>% select(grep('L', colnames(matrix_rddm)))
lrddm_info = datainfo %>% filter(organ=="Leaf")
# define groups (in the same order as the table)
lgroups = datainfo[datainfo$organ == "Leaf", ]$sex
leaves_rddm = pyramide(lrddm, lrddm_info, ~sex, lgroups)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## Disp = 0.79975 , BCV = 0.8943
## Removing intercept from test coefficients
rm(lrddm, lrddm_info, lgroups)
# Select samples
ferddm = matrix_rddm %>% select(grep('F', colnames(matrix_rddm)))
ferddm_info = datainfo %>% filter(sex=="Female")
# define groups (in the same order as the table)
fegroups = datainfo[datainfo$sex == "Female", ]$organ
females_rddm = pyramide(ferddm, ferddm_info, ~organ, fegroups)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## estimating size factors
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## final dispersion estimates
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## fitting model and testing
## Disp = 0.6601 , BCV = 0.8125
## Removing intercept from test coefficients
rm(ferddm, ferddm_info, fegroups)
# Select samples
marddm = matrix_rddm %>% select(grep('M', colnames(matrix_rddm)))
marddm_info = datainfo %>% filter(sex=="Male")
# define groups (in the same order as the table)
magroups = datainfo[datainfo$sex == "Male", ]$organ
males_rddm = pyramide(marddm, marddm_info, ~organ, magroups)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## estimating size factors
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## final dispersion estimates
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## fitting model and testing
## Disp = 0.85816 , BCV = 0.9264
## Removing intercept from test coefficients
rm(marddm, marddm_info, magroups)
rddm_list = list("Sex-biased in flower buds" = flowers_rddm[["clusters"]],
"Sex-biased in leaves" = leaves_rddm[["clusters"]],
"Tissue-biased in females" = females_rddm[["clusters"]],
"Tissue-biased in males" = males_rddm[["clusters"]])
upset(fromList(rddm_list),
nintersects = 8,
mainbar.y.label = "Shared DE sRNA clusters",
sets.x.label = "DE sRNA clusters",
matrix.color = met.brewer("Egypt")[2],
main.bar.color = met.brewer("Egypt")[3],
sets.bar.color = met.brewer("Egypt")[4],
shade.color = met.brewer("Egypt")[2],
point.size = 10,
line.size = 4,
text.scale = c(3, 4, 3, 4, 5, 4)
)
rm(rddm_list)
# Sex-biased in flowers
baits_flowers = flowers_rddm[["clusters"]]
baits_flowers = as.data.frame(cpm(matrix_rddm, log=TRUE)[baits_flowers, ] ) %>% select(grep('B', colnames(matrix_rddm)))
baits_flowers = baits_flowers %>% mutate(cluster = rownames(baits_flowers)) %>% group_by(cluster) %>% summarise(female_exp = mean(c(F1B_dicer, F2B_dicer, F3B_dicer)),
male_exp = mean(c(M1B_dicer, M2B_dicer, M4B_dicer)),
male_biased = male_exp>female_exp,
type = "Sex-biased sRNAs in flowers") %>% add_count(type)
# Sex-biased in leaves
baits_leaves = leaves_rddm[["clusters"]]
baits_leaves = as.data.frame(cpm(matrix_rddm, log=TRUE)[baits_leaves, ] ) %>% select(grep('L', colnames(matrix_rddm)))
baits_leaves = baits_leaves %>% mutate(cluster = rownames(baits_leaves)) %>% group_by(cluster) %>% summarise(female_exp = mean(c(F1L_dicer, F2L_dicer, F3L_dicer)),
male_exp = mean(c(M1L_dicer, M2L_dicer, M4L_dicer)),
male_biased = male_exp>female_exp,
type = "Sex-biased sRNAs in leaves") %>% add_count(type)
bait_df = rbind(baits_flowers, baits_leaves)
# Plot all
ggplot(bait_df) +
annotate(geom = "polygon", x = c(-Inf, Inf, Inf), y = c(-Inf, Inf, -Inf), fill = "gray8", alpha = 0.2 ) +
geom_abline(intercept = 0, slope = 1, alpha=0.6, color="red2", size=1, linetype="dashed") +
geom_jitter(aes(female_exp, male_exp, fill=male_biased), size=4, shape=21, color="black") +
geom_text(aes(x = -Inf, y = -Inf, label= paste0("n = ", n)), hjust = -.5, vjust = -28) +
scale_x_continuous(limits = c(-2.5, 10)) +
scale_y_continuous(limits = c(-2.5, 10)) +
labs(x= "♀ logRPM", y="♂ logRPM") +
scale_fill_manual(values = met.brewer("Derain")) +
theme_bw() +
theme(legend.position = "none",
text = element_text(size = 20),
panel.border = element_rect(color = "black", fill=NA),
aspect.ratio=1) +
facet_grid(~type)
## Warning: Removed 1912 rows containing missing values (`geom_point()`).
ggsave("results/desrnas_rddm.png", width = 8, height = 8, dpi = 600)
## Warning: Removed 1912 rows containing missing values (`geom_point()`).
rm(baits_flowers, baits_leaves)
# transform and add info to sRNA quantification
transposed = transpose(matrix_rddm)
colnames(transposed) = rownames(matrix_rddm)
rownames(transposed) = colnames(matrix_rddm)
transposed = cpm(transposed, log = T) # transform to logRPM
# add samples as column
transposed = as.data.frame(transposed) %>% mutate(sample = rownames(transposed)) %>% relocate(sample)
datainfo_s = datainfo %>% mutate(sample = rownames(datainfo)) %>% relocate(sample)
# combine
fermented_rddm = right_join(datainfo_s, transposed, by = "sample")
# transform and add info to gene quant
fermented_genes = transpose(kall)
colnames(fermented_genes) = rownames(kall)
rownames(fermented_genes) = colnames(kall)
fermented_genes = cpm(fermented_genes, log = T) # transform to logTPM
fermented_genes = as.data.frame(fermented_genes) %>% mutate(sample = rownames(fermented_genes)) %>% relocate(sample)
datainfo_s = kall_info %>% mutate(sample = rownames(kall_info)) %>% relocate(sample)
fermented_genes = right_join(datainfo_s, fermented_genes, by = "sample")
# Get overlapping clusters
over_flowers = read_table("results/rddm/flowers.tsv", col_names = F)
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## X1 = col_character(),
## X2 = col_character()
## )
colnames(over_flowers) = c("cluster", "gene_name")
over_flowers = over_flowers %>% relocate(gene_name)
# add an ID to each intersection (overlap gene and sRNA)
over_flowers = over_flowers %>% mutate(overlap = paste(gene_name, cluster, sep = " // "))
# get expression data of the genes
fermented_gene_bait = subset(fermented_genes, select = c("sample", "sex", "organ", levels(as.factor(over_flowers$gene_name)))) %>% mutate(element = "gene")
fermented_gene_bait = filter(fermented_gene_bait, organ=="Flower bud")
fermented_gene_bait = melt(fermented_gene_bait, id.vars = c("sample", "sex", "organ", "element"), variable.name = "name", value.name = "expression")
# get overlap ID
fermented_gene_bait = left_join(fermented_gene_bait, over_flowers[,c("gene_name", "overlap")], by=join_by("name"=="gene_name")) #somehow it gets filled in missing samples?
## Warning in left_join(fermented_gene_bait, over_flowers[, c("gene_name", : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 73 of `x` matches multiple rows in `y`.
## ℹ Row 1 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
# mow for the sRNA
fermented_rddm_bait = subset(fermented_rddm, select = c("sample", "sex", "organ", levels(as.factor(over_flowers$cluster)))) %>% mutate(element = "srna")
fermented_rddm_bait = filter(fermented_rddm_bait, organ=="Flower bud")
fermented_rddm_bait = melt(fermented_rddm_bait, id.vars = c("sample", "sex", "organ", "element"), variable.name = "name", value.name = "expression")
# get overlap ID
fermented_rddm_bait = left_join(fermented_rddm_bait, over_flowers[,c("cluster", "overlap")], by=join_by("name"=="cluster"))
# merge data
dough = rbind(fermented_gene_bait, fermented_rddm_bait)
# plot
dough %>% group_by(name, sex, organ) %>% summarise(mean = mean(expression), sex, name, element, organ, overlap) %>% unique() %>%
ggplot(aes(x=sex)) +
facet_wrap(~overlap, nrow = 5, ncol=5) +
geom_point(aes(y= mean, color = element), size=4) +
geom_line(aes(y= mean, group = name, color=element), size=2) +
scale_color_manual(values = met.brewer("Egypt")) +
labs(x=NULL, y="Mean Expression", color=NULL) +
theme(legend.position = "top",
text = element_text(size = 20),
panel.border = element_rect(color = "black", fill=NA),
aspect.ratio=1)
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
## always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'name', 'sex', 'organ'. You can override
## using the `.groups` argument.
ggsave("results/rddm_flowers_candidates.png", width = 36, height = 36, dpi = 600)
# plot only the best candidates (manual selection)
best_flowers = filter(over_flowers, gene_name %in% c("Silat_chr1_Gene.4771",
"Silat_chr12_Gene.14608",
"Silat_chr12_Gene.26394",
"Silat_chr2_Gene.30622",
"Silat_chr3_Gene.37746",
"Silat_chr9_Gene.46125"))$overlap
dough %>% filter(overlap %in% best_flowers) %>% group_by(name, sex, organ) %>% summarise(mean = mean(expression), sex, name, element, organ, overlap) %>% unique() %>% left_join(over_flowers[,c("gene_name", "overlap")], by="overlap") %>%
ggplot(aes(x=sex)) +
facet_wrap(~gene_name, nrow = 2, ncol=3) +
geom_point(aes(y= mean, color = element), size=4) +
geom_line(aes(y= mean, group = name, color=element), size=2) +
scale_color_manual(values = met.brewer("Egypt")) +
labs(x=NULL, y="Mean Expression", color=NULL) +
theme(legend.position = "top",
text = element_text(size = 20),
panel.border = element_rect(color = "black", fill=NA),
aspect.ratio=1)
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
## always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'name', 'sex', 'organ'. You can override
## using the `.groups` argument.
ggsave("results/rddm_flowers_candidates_best.png", width = 14, height = 8, dpi = 600)
###########################################################################
# now for leaves
over_leaves = read_table("results/rddm/leaves.tsv", col_names = F)
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## X1 = col_character(),
## X2 = col_character()
## )
colnames(over_leaves) = c("cluster", "gene_name")
over_leaves = over_leaves %>% relocate(gene_name)
# add an ID to each intersection (overlap gene and sRNA)
over_leaves = over_leaves %>% mutate(overlap = paste(gene_name, cluster, sep = " // "))
# get expression data of the genes
fermented_gene_bait = subset(fermented_genes, select = c("sample", "sex", "organ", levels(as.factor(over_leaves$gene_name)))) %>% mutate(element = "gene")
fermented_gene_bait = filter(fermented_gene_bait, organ=="Leaf")
fermented_gene_bait = melt(fermented_gene_bait, id.vars = c("sample", "sex", "organ", "element"), variable.name = "name", value.name = "expression")
# get overlap ID
fermented_gene_bait = left_join(fermented_gene_bait, over_leaves[,c("gene_name", "overlap")], by=join_by("name"=="gene_name")) #somehow it gets filled in missing samples?
## Warning in left_join(fermented_gene_bait, over_leaves[, c("gene_name", "overlap")], : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 89 of `x` matches multiple rows in `y`.
## ℹ Row 4 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
# mow for the sRNA
fermented_rddm_bait = subset(fermented_rddm, select = c("sample", "sex", "organ", levels(as.factor(over_leaves$cluster)))) %>% mutate(element = "srna")
fermented_rddm_bait = filter(fermented_rddm_bait, organ=="Leaf")
fermented_rddm_bait = melt(fermented_rddm_bait, id.vars = c("sample", "sex", "organ", "element"), variable.name = "name", value.name = "expression")
# get overlap ID
fermented_rddm_bait = left_join(fermented_rddm_bait, over_leaves[,c("cluster", "overlap")], by=join_by("name"=="cluster"))
# merge data
dough = rbind(fermented_gene_bait, fermented_rddm_bait)
# plot
dough %>% group_by(name, sex, organ) %>% summarise(mean = mean(expression), sex, name, element, organ, overlap) %>% unique() %>%
ggplot(aes(x=sex)) +
facet_wrap(~overlap, nrow = 5, ncol=3) +
geom_point(aes(y= mean, color = element), size=4) +
geom_line(aes(y= mean, group = name, color=element), size=2) +
scale_color_manual(values = met.brewer("Egypt")) +
labs(x=NULL, y="Mean Expression", color=NULL) +
theme(legend.position = "top",
text = element_text(size = 20),
panel.border = element_rect(color = "black", fill=NA),
aspect.ratio=1)
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
## always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'name', 'sex', 'organ'. You can override
## using the `.groups` argument.
ggsave("results/rddm_leaves_candidates.png", width = 20, height = 36, dpi = 600)
# plot the best ones
best_leaves = filter(over_leaves, gene_name %in% c("Silat_chr10_Gene.49028",
"Silat_chr12_Gene.2070",
"Silat_chr9_Gene.46125"))$overlap
dough %>% filter(overlap %in% best_leaves) %>% group_by(name, sex, organ) %>% summarise(mean = mean(expression), sex, name, element, organ, overlap) %>% unique() %>% left_join(over_leaves[,c("gene_name", "overlap")], by="overlap") %>%
ggplot(aes(x=sex)) +
facet_wrap(~gene_name, nrow = 2, ncol=3) +
geom_point(aes(y= mean, color = element), size=4) +
geom_line(aes(y= mean, group = name, color=element), size=2) +
scale_color_manual(values = met.brewer("Egypt")) +
labs(x=NULL, y="Mean Expression", color=NULL) +
theme(legend.position = "top",
text = element_text(size = 20),
panel.border = element_rect(color = "black", fill=NA),
aspect.ratio=1)
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
## always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'name', 'sex', 'organ'. You can override
## using the `.groups` argument.
ggsave("results/rddm_leaves_candidates_best.png", width = 14, height = 4, dpi = 600)
######################
rm(transposed, fermented_gene_bait, fermented_genes, fermented_rddm, fermented_rddm_bait, dough)
# select manually curated genes
best_flowers = filter(over_flowers, gene_name %in% c("Silat_chr1_Gene.4771",
"Silat_chr12_Gene.14608",
"Silat_chr12_Gene.26394",
"Silat_chr2_Gene.30622",
"Silat_chr3_Gene.37746",
"Silat_chr9_Gene.46125"))
best_leaves = filter(over_leaves, gene_name %in% c("Silat_chr10_Gene.49028",
"Silat_chr12_Gene.2070",
"Silat_chr9_Gene.46125"))
# get annotations
rddm_bed = read.table("results/rddm/rddm.bed")
colnames(rddm_bed) = c("chr_cluster", "start_cluster", "end_cluster", "cluster")
long_genes_bed = read.table("annotation/genes.bed")[,1:4] # I firts loaded the +2400bp but it is acctually not what we want
colnames(long_genes_bed) = c("chr_gene", "start_gene", "end_gene", "gene_name")
# merge annotations
best_flowers = left_join(best_flowers, long_genes_bed, by="gene_name") %>% left_join(rddm_bed, by="cluster")
best_leaves = left_join(best_leaves, long_genes_bed, by="gene_name") %>% left_join(rddm_bed, by="cluster")
# operations with coordinates
best_flowers = best_flowers %>% mutate(position = ifelse(start_gene > end_cluster, "upstream",
ifelse(start_cluster > end_gene, "downstream",
ifelse((start_cluster > start_gene) & (end_cluster < end_gene), "gene body", "NA")))) %>%
mutate(distance = ifelse(position=="upstream", (start_gene - start_cluster)/1000,
ifelse(position=="downstream", (start_cluster - end_gene)/1000, 0)))
best_leaves = best_leaves %>% mutate(position = ifelse(start_gene > end_cluster, "upstream",
ifelse(start_cluster > end_gene, "downstream",
ifelse((start_cluster > start_gene) & (end_cluster < end_gene), "gene body", "NA")))) %>%
mutate(distance = ifelse(position=="upstream", (start_gene - start_cluster)/1000,
ifelse(position=="downstream", (start_cluster - end_gene)/1000, 0)))
# plot
rbind(mutate(best_flowers, tissue="Flower buds"), mutate(best_leaves, tissue="Leaves")) %>% ggplot() +
facet_grid(~tissue, scales = "free_y") +
geom_point(aes(y=gene_name, x=distance, color=position), size=6) +
geom_segment(aes(y=gene_name, yend=gene_name, x=0, xend=distance, color=position), size= 2) +
labs(x="Distance to gene (Kbp)", y=NULL, color="Position of the sRNA cluster") +
scale_color_manual(values = rev(met.brewer("Juarez"))) +
theme_bw() +
theme(text = element_text(size = 20),
panel.border = element_rect(color = "black", fill=NA),
panel.spacing = unit(1, "lines"),
aspect.ratio=1)
ggsave("results/rddm_distances.png", width = 12, height = 8, dpi = 600)
# TEs surrounding the transcription factor
read.table("results/rddm/tes_around_genes.tsv")[,-c(8,10,11)] %>% mutate(up = V2-V6, down = V6-V3, length = V7-V6) # some stats
## V1 V2 V3 V4 V5 V6 V7
## 1 chr3 1839790 1844111 Silat_chr3_Gene.37746 chr3 1837703 1837859
## 2 chr3 1839790 1844111 Silat_chr3_Gene.37746 chr3 1837860 1837885
## 3 chr3 1839790 1844111 Silat_chr3_Gene.37746 chr3 1838213 1838685
## 4 chr3 1839790 1844111 Silat_chr3_Gene.37746 chr3 1838729 1838781
## 5 chr3 1839790 1844111 Silat_chr3_Gene.37746 chr3 1838730 1838791
## 6 chr3 1839790 1844111 Silat_chr3_Gene.37746 chr3 1843979 1844190
## 7 chr3 1839790 1844111 Silat_chr3_Gene.37746 chr3 1844139 1844197
## 8 chr3 1839790 1844111 Silat_chr3_Gene.37746 chr3 1844970 1845139
## 9 chr3 1839790 1844111 Silat_chr3_Gene.37746 chr3 1844990 1845017
## 10 chr3 1839790 1844111 Silat_chr3_Gene.37746 chr3 1845018 1845168
## 11 chr3 1839790 1844111 Silat_chr3_Gene.37746 chr3 1846017 1846043
## 12 chr3 1839790 1844111 Silat_chr3_Gene.37746 chr3 1846038 1846099
## 13 chr3 1839790 1844111 Silat_chr3_Gene.37746 chr3 1846044 1846113
## 14 chr3 1839790 1844111 Silat_chr3_Gene.37746 chr3 1846070 1846132
## 15 chr12 18331848 18335199 Silat_chr12_Gene.14608 chr12 18335464 18335560
## V9 up down length
## 1 Motif:rnd-5_family-2244 2087 -6408 156
## 2 Motif:RLC_SIRE_76712_FL_DL 1930 -6251 25
## 3 Motif:RLG_Tekay_54006_partial_D 1577 -5898 472
## 4 Motif:rnd-1_family-420 1061 -5382 52
## 5 Motif:rnd-1_family-420 1060 -5381 61
## 6 Motif:rnd-1_family-528 -4189 -132 211
## 7 Motif:RLG_Retand_14683_FL_DLTP -4349 28 58
## 8 Motif:RLG_Retand_90523_FL_DL -5180 859 169
## 9 Motif:RLG_Tekay_57285_FL_DL -5200 879 27
## 10 Motif:rnd-4_family-47 -5228 907 150
## 11 Motif:RLC_SIRE_90603_FL_DL -6227 1906 26
## 12 Motif:RLG_Athila_31436_FL_DLTP -6248 1927 61
## 13 Motif:RLG_CRM_70870_FL_DL -6254 1933 69
## 14 Motif:RLG_Reina_85746_FL_DL -6280 1959 62
## 15 Motif:RLG_Tekay_29122_FL_DLP -3616 265 96
# plot together the gene, sRNA and TEs
tf_vecinos = as.data.frame(mapply(c,
filter(rddm_bed, cluster=="Cluster_10374_RdDM") %>% mutate("sRNA cluster"),
read.table("results/rddm/clusters_around_genes.tsv")[, 5:8] %>% mutate("sRNA cluster"),
read.table("results/rddm/tes_around_genes.tsv")[1, 1:4] %>% mutate("Gene"),
read.table("results/rddm/tes_around_genes.tsv")[-15, c(5,6,7,9)] %>% mutate(V9 = paste(V9, 1:14, sep = "_"), "TEs")
))
colnames(tf_vecinos) = c("chr", "start", "end", "name", "element")
tf_vecinos$element = factor(tf_vecinos$element, levels = rev(c("Gene", "sRNA cluster", "TEs")))
ggplot(tf_vecinos, aes(xmin = as.numeric(start), xmax = as.numeric(end), y = element, fill = element)) +
geom_gene_arrow() +
labs(x="Chromosome 3", y=NULL) +
scale_fill_manual(values = rev(met.brewer("Egypt", 3))) +
scale_x_continuous(limits = c(1837000, 1846000)) +
theme_light() +
theme(text = element_text(size = 20),
panel.border = element_rect(color = "black", fill=NA),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1, size = 10))
## Warning: Removed 4 rows containing missing values (`geom_gene_arrow()`).
# THE LENGHTS IN THE PLOT WILL BE WRONG IF THEY ARE NOT CONTINOUS
ggsave("results/tf_vecinity.png", width = 8, height = 3, dpi = 600)
## Warning: Removed 4 rows containing missing values (`geom_gene_arrow()`).
rm(rddm_bed, long_genes_bed, best_flowers, best_leaves, tf_vecinos)
# We calculate depths for each sex-biased gene in Flowers
# NOTE: genes without sRNA mappping will NOT appear here
# Females
flo_rddm_fe = fread("results/rddm/flowers_depth_females.tsv")
flo_rddm_fe = surco_gene(flo_rddm_fe, 54935002, 29689313, 25245689)
colnames(flo_rddm_fe) = c("gene", "log_female_exp", "female_exp")
# Males
flo_rddm_ma = fread("results/rddm/flowers_depth_males.tsv")
flo_rddm_ma = surco_gene(flo_rddm_ma, 13224600, 23943681, 23582102)
colnames(flo_rddm_ma) = c("gene", "log_male_exp", "male_exp")
# marge and fill
flo_rddm = full_join(flo_rddm_fe, flo_rddm_ma, by="gene")
# plot
ggplot(flo_rddm) +
annotate(geom = "polygon", x = c(-Inf, Inf, Inf), y = c(-Inf, Inf, -Inf), fill = "gray8", alpha = 0.2 ) +
geom_abline(intercept = 0, slope = 1, alpha=0.6, color="red2", size=1, linetype="dashed") +
geom_jitter(aes(log_female_exp, log_male_exp), color=met.brewer("Derain", 1), size=4) +
scale_x_continuous(limits = c(-5, 4)) +
scale_y_continuous(limits = c(-5, 4)) +
labs(x= "♀ log2RPM sRNA", y="♂ log2RPM sRNA") +
scale_fill_manual(values = met.brewer("Derain")) +
theme_bw() +
theme(legend.position = "none",
text = element_text(size = 20),
panel.border = element_rect(color = "black", fill=NA),
aspect.ratio=1)
## Warning: Removed 96 rows containing missing values (`geom_point()`).
ggsave("results/gene-level_mapping_rddm.png", width = 8, height = 6, dpi = 600)
## Warning: Removed 96 rows containing missing values (`geom_point()`).
# select best candidates
flo_rddm_best = filter(flo_rddm, gene %in% c("Silat_chr1_Gene.4771",
"Silat_chr12_Gene.14608",
"Silat_chr12_Gene.26394",
"Silat_chr2_Gene.30622",
"Silat_chr3_Gene.37746",
"Silat_chr9_Gene.46125",
"Silat_chr10_Gene.49028",
"Silat_chr12_Gene.2070"))
flo_rddm_best = melt(flo_rddm_best, id.vars = "gene", measure.vars = c("log_female_exp", "log_male_exp"))
ggplot(flo_rddm_best, aes(x=variable)) +
facet_wrap(~gene, nrow = 4, ncol=3) +
geom_point(aes(y= value, color = gene), size=4) +
geom_line(aes(y= value, group = gene, color=gene), size=2) +
scale_color_manual(values = met.brewer("Egypt", 8)) +
labs(x=NULL, y="Mean gene-level sRNA mapping", color=NULL) +
scale_x_discrete(labels=c("log_female_exp" = "Female", "log_male_exp" = "Male")) +
theme(legend.position = "none",
text = element_text(size = 20),
panel.border = element_rect(color = "black", fill=NA),
aspect.ratio=1)
ggsave("results/gene-level_rddm_best.png", width = 16, height = 10, dpi = 600)
# new function that does not get averages
surco_best = function(toy, lib_size_1, lib_size_2, lib_size_3) {
# set column names
colnames(toy) = c("chr", "start", "end", "depth_1", "depth_2", "depth_3", "gene")
# normalize by library size
toy[, mean_depth_1 := depth_1/lib_size_1, by=1:nrow(toy)]
toy[, mean_depth_2 := depth_2/lib_size_2, by=1:nrow(toy)]
toy[, mean_depth_3 := depth_3/lib_size_3, by=1:nrow(toy)]
# overall mean and multiply by 1m
toy[, mean_depth := (mean_depth_1 + mean_depth_2 + mean_depth_3)*(1000000/3), by=1:nrow(toy)]
# print bed file + mean depth per million
toy[, .(gene, chr, start, mean_depth)]
}
# Females
flo_rddm_fe = fread("results/rddm/flowers_depth_females.tsv")
tf_fe = surco_best(flo_rddm_fe, 54935002, 29689313, 25245689)
colnames(tf_fe) = c("gene", "chr", "start", "Females")
tf_fe = filter(tf_fe, gene=="Silat_chr3_Gene.37746")
# Males
flo_rddm_ma = fread("results/rddm/flowers_depth_males.tsv")
tf_ma = surco_best(flo_rddm_ma, 13224600, 23943681, 23582102)
colnames(tf_ma) = c("gene", "chr", "start", "Males")
tf_ma = filter(tf_ma, gene=="Silat_chr3_Gene.37746")
# merge and reshape
full_join(tf_fe, tf_ma, by="start")[,c("start", "Females", "Males")] %>% melt(id.vars = "start", measure.vars = c("Females", "Males"), variable.name = "sex", value.name = "mapping") %>% ggplot()+
geom_col(aes(x=as.numeric(start), y=mapping, fill=sex), stat = 'identity') +
scale_y_continuous(limits = c(0,3)) +
scale_x_continuous(limits = c(1837000, 1846000)) +
scale_fill_manual(values = met.brewer("Derain")) +
labs(x=NULL, y="RPM", fill="Sex") +
theme_bw()+
theme(text = element_text(size=20),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
axis.text.x = element_text(angle = 45, hjust = 1, size = 10))
## Warning in geom_col(aes(x = as.numeric(start), y = mapping, fill = sex), :
## Ignoring unknown parameters: `stat`
## Warning: Removed 948 rows containing missing values (`position_stack()`).
## Warning: Removed 2 rows containing missing values (`geom_col()`).
ggsave("results/tf_mapping.png", width = 8, height = 2, dpi = 600)
## Warning: Removed 948 rows containing missing values (`position_stack()`).
## Removed 2 rows containing missing values (`geom_col()`).
rm(surco_best, flo_rddm_fe, flo_rddm_ma, tf_fe, tf_ma, flo_rddm, flo_rddm_best)
# get the union of DE genes (union works in pairs)
de_genes = union(flowers_genes[["clusters"]], leaves_genes[["clusters"]])
de_genes = as.matrix(cpm(kall, log = T)[de_genes, ])
# Only Flowers
{
de_genes = flowers_genes[["clusters"]]
de_genes = cpm(kall, log = T)[de_genes, ]
de_genes = as.data.frame(de_genes) %>% select(grep('B', colnames(de_genes)))
de_genes = as.matrix(de_genes)
}
# Only Leaves
{
de_genes = leaves_genes[["clusters"]]
de_genes = cpm(kall, log = T)[de_genes, ]
de_genes = as.data.frame(de_genes) %>% select(grep('L', colnames(de_genes)))
de_genes = as.matrix(de_genes)
}
# remove TE genes
#!(as.vector(unique(read.table("tesorter_genes/results.tsv")[,2])) %in% rownames(de_genes) )
!(as.vector(rownames(de_genes) %in% unique(read.table("tesorter_genes/results.tsv")[,2])))
#de_genes = de_genes[!(as.vector(unique(read.table("tesorter_genes/results.tsv")[,2])) %in% rownames(de_genes) ),]
de_genes = de_genes[!(as.vector(rownames(de_genes) %in% unique(read.table("tesorter_genes/results.tsv")[,2]))),]
# NOTE: within a total union 6678 DE genes, only 3705 are not TE-genes (55%)
# Between the sex-biased, only 243 are TE-genes
# get z-score
de_genes = t(apply(de_genes, 1, function(row) {
(row - mean(row)) / sd(row)
}))
# Calculate correlations
weight_mat = GENIE3(de_genes, nCores=16, verbose=TRUE)
dim(weight_mat)
link_list = getLinkList(weight_mat, threshold=0.005) # report all
# Check modules
my_network = graph_from_data_frame(link_list, directed = F)
modules = cluster_leiden(my_network, resolution_parameter = 0.001, objective_function = "modularity") # 1 cluster in flowers+leaves and only leaves vs 54 clusters only in flowers
# get SB gene annotation, use those from the modules to select
#network_annotation = bait_df %>% select(gene, male_biased) %>% distinct(gene, .keep_all = TRUE) %>% filter(gene %in% modules$names) %>% mutate(bait = gene == "Silat_chr12_Gene.1440") # PTGS bait from flowers
network_annotation = bait_df %>% select(gene, male_biased) %>% distinct(gene, .keep_all = TRUE) %>% filter(gene %in% modules$names) %>% mutate(bait = gene == "Silat_chr3_Gene.37746") # RdDM bait from flowers
# get number of connections (edges) per node
edges_network = data.frame(gene = as.factor(V(my_network)),
no_edges = igraph::degree(my_network, mode = "all") )
edges_network$gene = rownames(edges_network)
# merge annotations
network_annotation = left_join(network_annotation, edges_network, by="gene")
# re-anotate network
my_network = graph_from_data_frame(link_list, directed = F, vertices = network_annotation)
# PLOT
ggraph(my_network, layout = "kk", circular = F) +
geom_edge_diagonal(color = "grey70", width = 0.5, alpha = 0.8) +
geom_node_point(aes(fill = male_biased, color=bait, shape=bait, size=no_edges), alpha = 0.8) +
scale_fill_manual(values = met.brewer("Derain")) +
scale_shape_manual(values = c(21, 19)) +
scale_color_manual(values = c("black", "blue")) +
theme_void() +
theme(
text = element_text(size = 20),
legend.position = "none"
)
#ggsave("results/network.png", width = 8, height = 8, dpi = 600)
#ggsave("results/network_flowers.png", width = 8, height = 8, dpi = 600)
#ggsave("results/network_leaves.png", width = 8, height = 8, dpi = 600)
rm(de_genes, weight_mat, link_list, my_network, network_annotation, edges_network)
columns:
gene name bias average exp (tpm) 21-22 sRNA mapping (rpm) 24 sRNA mapping (rpm) sex tissue + comparison
library sizes: flowers females: 54935002, 29689313, 25245689 flowers males: 13224600, 23943681, 23582102 leaves females: 24695806, 31425418, 21443081 leaves males: 20502566, 17763390, 26371650
# get info of sex-biased genes in flowers
bias_flowers = read.table("results/differential_expression/sex_biased_flowers_sbge.bed")[, c(4,8)] %>% rename(V4="gene", V8="bias")
bias_leaves = read.table("results/differential_expression/sex_biased_leaves_sbge.bed")[, c(4,8)] %>% rename(V4="gene", V8="bias")
# get raw expression data (reload again just in case)
fermented_genes = transpose(tpm)
colnames(fermented_genes) = rownames(tpm)
rownames(fermented_genes) = colnames(tpm)
fermented_genes = as.data.frame(fermented_genes) %>% mutate(sample = rownames(fermented_genes)) %>% relocate(sample)
datainfo_s = kall_info %>% mutate(sample = rownames(kall_info)) %>% relocate(sample) %>% rename(organ="tissue")
fermented_genes = right_join(datainfo_s, fermented_genes, by = "sample")
# get genes and average replicates, add bias info
# do flowers and leaves indpendently, otherwise it's a mess to merge bias info (a gene may be differently biased in flowers or leaves)
# I selected the tissue where sex-biased was measured to avoid mixing up unrelated stuff (sbge in flowers == only flower samples)
exp_flowers = subset(fermented_genes, select = c("sample", "sex", "tissue", levels(as.factor(bias_flowers$gene)))) %>% melt(id.vars = c("sample", "sex", "tissue"), variable.name="gene") %>% group_by(sex, tissue, gene) %>% summarise(gene, mean_exp = mean(value), sex, tissue) %>% ungroup() %>% unique() %>% left_join(bias_flowers, by="gene") %>% filter(tissue == "Flower bud")
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
## always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'sex', 'tissue', 'gene'. You can override
## using the `.groups` argument.
exp_leaves = subset(fermented_genes, select = c("sample", "sex", "tissue", levels(as.factor(bias_leaves$gene)))) %>% melt(id.vars = c("sample", "sex", "tissue"), variable.name="gene") %>% group_by(sex, tissue, gene) %>% summarise(gene, mean_exp = mean(value), sex, tissue) %>% ungroup() %>% unique() %>% left_join(bias_leaves, by="gene") %>% filter(tissue == "Leaf")
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
## always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'sex', 'tissue', 'gene'. You can override
## using the `.groups` argument.
rm(fermented_genes, datainfo_s)
####################################################################
# sRNA data
############ Flowers
#### 21/22 - PTGS
# Females
flo_ptgs_fe = fread("results/gene-level/flowers_depth_females.tsv")
flo_ptgs_fe = surco_gene(flo_ptgs_fe, 54935002, 29689313, 25245689)
flo_ptgs_fe = rbind(
flo_ptgs_fe = flo_ptgs_fe %>% select(gene, mean_depth),
data.frame(gene = setdiff(bias_flowers$gene, flo_ptgs_fe$gene), mean_depth = 0)
) %>% mutate(sex = "Female", tissue = "Flower bud")
# Males
flo_ptgs_ma = fread("results/gene-level/flowers_depth_males.tsv")
flo_ptgs_ma = surco_gene(flo_ptgs_ma, 13224600, 23943681, 23582102)
flo_ptgs_ma = rbind(
flo_ptgs_ma = flo_ptgs_ma %>% select(gene, mean_depth),
data.frame(gene = setdiff(bias_flowers$gene, flo_ptgs_ma$gene), mean_depth = 0)
) %>% mutate(sex = "Male", tissue = "Flower bud")
# concatenate
flo_ptgs = rbind(flo_ptgs_fe, flo_ptgs_ma) %>% rename(mean_depth = "ptgs")
rm(flo_ptgs_fe, flo_ptgs_ma)
#### 24 - RdDM
# Females
flo_rddm_fe = fread("results/rddm/flowers_depth_females.tsv")
flo_rddm_fe = surco_gene(flo_rddm_fe, 54935002, 29689313, 25245689)
flo_rddm_fe = rbind(
flo_rddm_fe = flo_rddm_fe %>% select(gene, mean_depth),
data.frame(gene = setdiff(bias_flowers$gene, flo_rddm_fe$gene), mean_depth = 0)
) %>% mutate(sex = "Female", tissue = "Flower bud")
# Males
flo_rddm_ma = fread("results/rddm/flowers_depth_males.tsv")
flo_rddm_ma = surco_gene(flo_rddm_ma, 13224600, 23943681, 23582102)
flo_rddm_ma = flo_rddm_ma %>% select(gene, mean_depth) %>% mutate(sex = "Male", tissue = "Flower bud") # all genes had depth info
# concatenate
flo_rddm = rbind(flo_rddm_fe, flo_rddm_ma) %>% rename(mean_depth = "rddm")
rm(flo_rddm_fe, flo_rddm_ma)
#### merge
exp_flowers = left_join(exp_flowers, flo_ptgs, by=c("gene", "sex", "tissue")) %>% left_join(flo_rddm, by=c("gene", "sex", "tissue")) %>% mutate(comparison = "Sex-biased in flowers")
rm(flo_ptgs, flo_rddm)
############ Leaves
#### 21/22 - PTGS
# Females
le_ptgs_fe = fread("results/gene-level/leaves_depth_females.tsv")
le_ptgs_fe = surco_gene(le_ptgs_fe, 24695806, 31425418, 21443081)
le_ptgs_fe = rbind(
le_ptgs_fe = le_ptgs_fe %>% select(gene, mean_depth),
data.frame(gene = setdiff(bias_leaves$gene, le_ptgs_fe$gene), mean_depth = 0)
) %>% mutate(sex = "Female", tissue = "Leaf")
# Males
le_ptgs_ma = fread("results/gene-level/leaves_depth_males.tsv")
le_ptgs_ma = surco_gene(le_ptgs_ma, 20502566, 17763390, 26371650)
le_ptgs_ma = rbind(
le_ptgs_ma = le_ptgs_ma %>% select(gene, mean_depth),
data.frame(gene = setdiff(bias_leaves$gene, le_ptgs_ma$gene), mean_depth = 0)
) %>% mutate(sex = "Male", tissue = "Leaf")
# concatenate
le_ptgs = rbind(le_ptgs_fe, le_ptgs_ma) %>% rename(mean_depth = "ptgs")
rm(le_ptgs_fe, le_ptgs_ma)
#### 24 - RdDM
# Females
le_rddm_fe = fread("results/rddm/leaves_depth_females.tsv")
le_rddm_fe = surco_gene(le_rddm_fe, 24695806, 31425418, 21443081)
le_rddm_fe = rbind(
le_rddm_fe = le_rddm_fe %>% select(gene, mean_depth),
data.frame(gene = setdiff(bias_leaves$gene, le_rddm_fe$gene), mean_depth = 0)
) %>% mutate(sex = "Female", tissue = "Leaf")
# Males
le_rddm_ma = fread("results/gene-level/leaves_depth_males.tsv")
le_rddm_ma = surco_gene(le_rddm_ma, 20502566, 17763390, 26371650)
le_rddm_ma = rbind(
le_rddm_ma = le_rddm_ma %>% select(gene, mean_depth),
data.frame(gene = setdiff(bias_leaves$gene, le_rddm_ma$gene), mean_depth = 0)
) %>% mutate(sex = "Male", tissue = "Leaf")
# concatenate
le_rddm = rbind(le_rddm_fe, le_rddm_ma) %>% rename(mean_depth = "rddm")
rm(le_rddm_fe, le_rddm_ma)
#### merge
exp_leaves = left_join(exp_leaves, le_ptgs, by=c("gene", "sex", "tissue")) %>% left_join(le_rddm, by=c("gene", "sex", "tissue")) %>% mutate(comparison = "Sex-biased in leaves")
rm(le_ptgs, le_rddm)
########################
# Merge leaves and flowers
exp_all = rbind(exp_flowers, exp_leaves)
rm(exp_flowers, exp_leaves)
# Save the table
write.table(exp_all, file = "results/sbge_ptgs_rddm.tsv", quote = F, col.names = T, row.names = F, sep = "\t")
# plot correlation ptgs vs rddm
ggplot(exp_all) +
geom_jitter(aes(log2(ptgs), log2(rddm)), color="black", size=1) +
labs(x= "21/22-nt sRNA log2RPM", y="24-nt sRNA log2RPM") +
theme_bw() +
theme(legend.position = "none",
text = element_text(size = 20),
panel.border = element_rect(color = "black", fill=NA),
aspect.ratio=1)
ggsave("results/ptgs_vs_rddm.png", width = 6, height = 4, dpi = 600)
# plot correlation ptgs vs sbge
ggplot(exp_all) +
geom_jitter(aes(log2(mean_exp), log2(ptgs)), color="black", size=1) +
facet_wrap(~comparison, ncol = 2, scales = "free_x") +
labs(x= "mRNA log2TPM", y="21/22-nt sRNA log2RPM") +
theme_bw() +
theme(legend.position = "none",
text = element_text(size = 20),
panel.border = element_rect(color = "black", fill=NA),
axis.text.x = element_text(angle = 45, hjust = 1, size = 10),
aspect.ratio=1)
ggsave("results/ptgs_vs_sbge.png", width = 8, height = 4, dpi = 600)
# plot correlation rddm vs sbge
ggplot(exp_all) +
geom_jitter(aes(log2(mean_exp), log2(rddm)), color="black", size=1) +
facet_wrap(~comparison, ncol = 2, scales = "free_x") +
labs(x= "mRNA log2TPM", y="24-nt sRNA log2RPM") +
theme_bw() +
theme(legend.position = "none",
text = element_text(size = 20),
panel.border = element_rect(color = "black", fill=NA),
axis.text.x = element_text(angle = 45, hjust = 1, size = 10),
aspect.ratio=1)
ggsave("results/rddm_vs_sbge.png", width = 8, height = 4, dpi = 600)
# Plot comparison Female vs Male sRNA mapping PTGS
ggplot(exp_all) +
geom_violin(aes(x=bias, y=log2(ptgs), fill=sex), color="white") +
geom_boxplot(aes(x=bias, y=log2(ptgs), fill=sex), color="gray4", alpha=0.2, notch = T, position = position_dodge(width=0.9)) +
facet_wrap(~tissue) +
labs(x="Gene bias", y="21/22-nt sRNA log2RPM", fill="Sex expression") +
scale_fill_manual(values = met.brewer("Derain")) +
theme_bw() +
theme(legend.position = "top",
text = element_text(size = 20),
panel.border = element_rect(color = "black", fill=NA),
aspect.ratio=1)
## Warning: Removed 215 rows containing non-finite values (`stat_ydensity()`).
## Warning: Removed 215 rows containing non-finite values (`stat_boxplot()`).
## Notch went outside hinges
## ℹ Do you want `notch = FALSE`?
ggsave("results/ptgs_vs_sbge_sex.png", width = 10, height = 6, dpi = 600)
## Warning: Removed 215 rows containing non-finite values (`stat_ydensity()`).
## Removed 215 rows containing non-finite values (`stat_boxplot()`).
## Notch went outside hinges
## ℹ Do you want `notch = FALSE`?
# Plot comparison Female vs Male sRNA mapping RdDM
ggplot(exp_all) +
geom_violin(aes(x=bias, y=log2(rddm), fill=sex), color="white") +
geom_boxplot(aes(x=bias, y=log2(rddm), fill=sex), color="gray4", alpha=0.2, notch = T, position = position_dodge(width=0.9)) +
facet_wrap(~tissue) +
labs(x="Gene bias", y="24-nt sRNA log2RPM", fill="Sex expression") +
scale_fill_manual(values = met.brewer("Derain")) +
theme_bw() +
theme(legend.position = "top",
text = element_text(size = 20),
panel.border = element_rect(color = "black", fill=NA),
aspect.ratio=1)
## Warning: Removed 80 rows containing non-finite values (`stat_ydensity()`).
## Warning: Removed 80 rows containing non-finite values (`stat_boxplot()`).
ggsave("results/rddm_vs_sbge_sex.png", width = 10, height = 6, dpi = 600)
## Warning: Removed 80 rows containing non-finite values (`stat_ydensity()`).
## Removed 80 rows containing non-finite values (`stat_boxplot()`).
rm(exp_all)
# Flowers
# none of them are in the Y chr
pi_flo = drop_na(read.table("popgen/pi_flower_sbg_pi.txt", header = T) )[, 2:5]
colnames(pi_flo) = c("chr", "start", "end", "pi")
bias_flowers = read.table("results/differential_expression/sex_biased_flowers_sbge.bed")[, c(1:4,8)]
colnames(bias_flowers) = c("chr", "start", "end", "gene", "bias")
pi_flo = left_join(pi_flo, bias_flowers, by=c("chr", "start", "end"))
# load pi values from unibased genes
pi_unbiased = drop_na(read.table("popgen/pi_unbiased_sbg_pi.txt", header = T) )[, 2:5]
colnames(pi_unbiased) = c("chr", "start", "end", "pi")
pi_unbiased = pi_unbiased %>% mutate(gene = paste("un", 1:nrow(pi_unbiased), sep = "_"), bias = "unbiased")
# stats
shapiro.test(filter(pi_flo, bias=="male-biased")$pi) # not normally distributed
##
## Shapiro-Wilk normality test
##
## data: filter(pi_flo, bias == "male-biased")$pi
## W = 0.80766, p-value = 3.573e-12
shapiro.test(filter(pi_flo, bias=="female-biased")$pi) # not normally distributed
##
## Shapiro-Wilk normality test
##
## data: filter(pi_flo, bias == "female-biased")$pi
## W = 0.80349, p-value = 1.975e-07
shapiro.test(pi_unbiased$pi) # not normally distributed
##
## Shapiro-Wilk normality test
##
## data: pi_unbiased$pi
## W = 0.82899, p-value < 2.2e-16
var.test(x=filter(pi_flo, bias=="male-biased")$pi, y=filter(pi_flo, bias=="female-biased")$pi)
##
## F test to compare two variances
##
## data: filter(pi_flo, bias == "male-biased")$pi and filter(pi_flo, bias == "female-biased")$pi
## F = 1.4474, num df = 137, denom df = 58, p-value = 0.1117
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.9162821 2.1986361
## sample estimates:
## ratio of variances
## 1.447436
# there is no significant difference between the variances
t.test(x=filter(pi_flo, bias=="male-biased")$pi, y=filter(pi_flo, bias=="female-biased")$pi, var.equal = TRUE)
##
## Two Sample t-test
##
## data: filter(pi_flo, bias == "male-biased")$pi and filter(pi_flo, bias == "female-biased")$pi
## t = 0.97987, df = 195, p-value = 0.3284
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.009452139 0.028118916
## sample estimates:
## mean of x mean of y
## 0.1604355 0.1511021
t.test(pi ~ bias, data = pi_flo, var.equal = TRUE)
##
## Two Sample t-test
##
## data: pi by bias
## t = -0.97987, df = 195, p-value = 0.3284
## alternative hypothesis: true difference in means between group female-biased and group male-biased is not equal to 0
## 95 percent confidence interval:
## -0.028118916 0.009452139
## sample estimates:
## mean in group female-biased mean in group male-biased
## 0.1511021 0.1604355
# p values not significant (0.32), no difference
wilcox.test(pi ~ bias, data=pi_flo)
##
## Wilcoxon rank sum test with continuity correction
##
## data: pi by bias
## W = 3790.5, p-value = 0.4449
## alternative hypothesis: true location shift is not equal to 0
# p values not significant (0.44), no difference
# concatenate
pi_flo = rbind(pi_flo, pi_unbiased)
# comparison to unbiased
var.test(x=filter(pi_flo, bias=="male-biased")$pi, y=pi_unbiased$pi)
##
## F test to compare two variances
##
## data: filter(pi_flo, bias == "male-biased")$pi and pi_unbiased$pi
## F = 0.76258, num df = 137, denom df = 1407, p-value = 0.04257
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.6022454 0.9907896
## sample estimates:
## ratio of variances
## 0.7625785
var.test(x=filter(pi_flo, bias=="female-biased")$pi, y=pi_unbiased$pi)
##
## F test to compare two variances
##
## data: filter(pi_flo, bias == "female-biased")$pi and pi_unbiased$pi
## F = 0.52685, num df = 58, denom df = 1407, p-value = 0.00257
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.3742442 0.7914290
## sample estimates:
## ratio of variances
## 0.5268477
# there is A significant difference between the variances (p=0.4 for males and 0.002 for females)
wilcox.test(x=filter(pi_flo, bias=="male-biased")$pi, y=pi_unbiased$pi, var.equal = F)
##
## Wilcoxon rank sum test with continuity correction
##
## data: filter(pi_flo, bias == "male-biased")$pi and pi_unbiased$pi
## W = 91144, p-value = 0.23
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(x=filter(pi_flo, bias=="female-biased")$pi, y=pi_unbiased$pi, var.equal = F)
##
## Wilcoxon rank sum test with continuity correction
##
## data: filter(pi_flo, bias == "female-biased")$pi and pi_unbiased$pi
## W = 36245, p-value = 0.097
## alternative hypothesis: true location shift is not equal to 0
# p values not significant (males p=0.23, females p=0.09)
# plot all
ggplot(pi_flo) +
geom_jitter(aes(x=bias, y=pi, color=bias), alpha=0.4) +
geom_violin(aes(x=bias, y=pi, fill=bias), color="white", alpha=0.6) +
geom_boxplot(aes(x=bias, y=pi, fill=bias), color="gray4", alpha=0.2, notch = T) +
labs(x="Gene bias", y="Nucleotide diversity (Ï€)") +
scale_fill_manual(values = met.brewer("Derain")[c(1,2,4)]) +
scale_color_manual(values = met.brewer("Derain")[c(1,2,4)]) +
theme_bw() +
theme(legend.position = "none",
text = element_text(size = 20),
panel.border = element_rect(color = "black", fill=NA),
aspect.ratio=1)
ggsave("results/pi.png", width = 8, height = 6, dpi = 600)
# Leaves
pi_le = drop_na(read.table("popgen/pi_leaves_sbg_pi.txt", header = T) )[, 2:5]
colnames(pi_le) = c("chr", "start", "end", "pi")
# only gene Silat_chr4_Gene.22085 is not SB in flowers too, pi=0.16 (around the mean)
bias_leaves = read.table("results/differential_expression/sex_biased_leaves_sbge.bed")[, c(1:4,8)]
colnames(bias_leaves) = c("chr", "start", "end", "gene", "bias")
pi_le = left_join(pi_le, bias_leaves, by=c("chr", "start", "end"))
rm(pi_flo, pi_le, pi_unbiased, bias_flowers, bias_leaves)